This is transcript-level analysis.
The original FASTQ files (50 bp, single-end, reverse stranded) were processed with kallisto using a custom transcriptome, which is a combination of the REFSEQ mRNA sequences AND RepBase D. melanogaster TE seqeunces (refGene_plus_repbase)
./kallisto quant –single -t 4 -l 200 -s 50 -b 30 –rf-stranded -i transcriptome_plus_repbase/refGene_plus_repbase -o YL25 YL25_shW_RNAseq_1st.fq.gz
./kallisto quant –single -t 4 -l 200 -s 50 -b 30 –rf-stranded -i transcriptome_plus_repbase/refGene_plus_repbase -o YL26 YL26_shSmt3_RNAseq_1st.fq.gz
./kallisto quant –single -t 4 -l 200 -s 50 -b 30 –rf-stranded -i transcriptome_plus_repbase/refGene_plus_repbase -o YL27 YL27_shW_RNAseq_2nd.fq.gz
./kallisto quant –single -t 4 -l 200 -s 50 -b 30 –rf-stranded -i transcriptome_plus_repbase/refGene_plus_repbase -o YL28 YL28_shSmt3_RNAseq_2nd.fq.gz
Next, we use these files for sleuth analysis.
s2c<-data.frame(sample=c("YL25","YL27","YL26","YL28"),condition=c("ctrl", "ctrl","KD","KD"),
path=c("../kallisto/YL25/","../kallisto/YL27/", "../kallisto/YL26/","../kallisto/YL28/"))
s2c$path<-as.character(s2c$path)
print(s2c)
## sample condition path
## 1 YL25 ctrl ../kallisto/YL25/
## 2 YL27 ctrl ../kallisto/YL27/
## 3 YL26 KD ../kallisto/YL26/
## 4 YL28 KD ../kallisto/YL28/
# #this is to create a transcript to gene name dictionary - I just transform each fasta header. It is only necessary if using gene-based aggregation
# t<-read.table("../kallisto/YL26/abundance.tsv", header=T, as.is=T)
# t2g<-data.frame("target_id"=t$target_id,
# gene=sub(":CDS.*","",sub(".*gene=","", t$target_id)))
# t2g$coding<-grepl("CDS", t2g$target_id)
# t2g$TE<-grepl("\\|", t2g$target_id)
# # number of elements of different type: with CDS annotation, RepBase entries (TE), and none (ncRNAs or genes with no CDS)
# t2g %>% select(gene,coding, TE) %>% distinct() %>% group_by(coding, TE) %>% tally()
# make "so"
so <- sleuth_prep(s2c, ~condition )#, target_mapping = t2g, extra_bootstrap_summary=T, read_bootstrap_tpm = TRUE, aggregation_column = 'gene', gene_mode=TRUE)
## reading in kallisto results
## dropping unused factor levels
## ....
## normalizing est_counts
## 18141 targets passed the filter
## normalizing tpm
## merging in metadata
## summarizing bootstraps
## ....
Here, we can inspect the input data table.
units
## [1] "est_counts"
table1<-sleuth_to_matrix(so, which_df = "obs_norm", which_units = units) %>% as.data.frame() %>%
rename(shW.1=YL25, shW.2=YL27, shSmt3.1=YL26, shSmt3.2=YL28)
table1 %>% round(digits=2) %>% datatable(options=list(pageLength=5))
## Warning in instance$preRenderHook(instance): It seems your data is too big for client-side DataTables. You may consider server-side
## processing: https://rstudio.github.io/DT/server.html
Running sleuth Likelihood Ratio test (LRT), and displaying a table of significantly changed genes, qval<0.05. This table DOES NOT have information about up- or down- regulation.
so <- sleuth_fit(so, ~condition,'full')
so <- sleuth_fit(so, ~1, 'reduced')
so <- sleuth_lrt(so, 'reduced', 'full')
sleuth_table <- sleuth_results(so, 'reduced:full', test_type = 'lrt', show_all = TRUE)
dplyr::filter(sleuth_table, qval <= 0.05) %>% datatable(options = list(pageLength=5))
Running wald test & plotting MA plot from these analysis (note that the y-axis is the beta value).
Note: the displayed table shows all results, however the MA plot does not show genes which did not pass filters, and Inf values.
so <- sleuth_wt(so, 'conditionKD')
results_table_wt <- sleuth_results(so, 'conditionKD', show_all = TRUE)
results_table_wt %>% datatable(options = list(pageLength=5))
p<-sleuth::plot_ma(so, "conditionKD", sig_color="green4", sig_level = 0.05)
p + theme_classic() + ggtitle("MA plot of genes and TEs, sleuth") + geom_text(data=(results_table_wt %>% filter(str_detect(target_id,"(smt3|^R1_DM|^R2_DM)"))), aes(mean_obs, b, label=sub(".*gene=(.*):CDS.*", "\\1",sub("_DM.*","",target_id))), position = position_nudge(x=0.5))

Fig. MA plot from sleuth analysis, Wald test. All genes with qvalue<0.05 are highlighted (warning: beta != log2FC)
Below it is the same MA plot, but I only marked genes with >2 fold change, calculated from the original values in the sleuth table above. Note: Some values that are Inf cannot be displayed.
table.srbp<-sleuth_to_matrix(so, which_df = "obs_norm", which_units = units) %>% as.data.frame() %>% rename(shW.1=YL25, shW.2=YL27, shSmt3.1=YL26, shSmt3.2=YL28) %>% tibble::rownames_to_column("target_id") %>% mutate(x=(shW.1+shW.2)/2, y=(shSmt3.1+shSmt3.2)/2) %>% mutate(log2FC=log2(y/x))
res<-sleuth_results(so, 'conditionKD', show_all = FALSE)
res<-dplyr::left_join(res,table.srbp, by="target_id") %>% mutate(signFC=(qval<0.05 & abs(log2FC)>1), significant=qval<0.05) %>% arrange(qval)
p<-ggplot(res, aes(mean_obs, b)) + theme_classic() + ggtitle("MA plot of genes and TEs, sleuth") +
geom_point(aes(colour = signFC), alpha = 0.2) +
scale_colour_manual(values = c("black", "green4")) +
xlab(paste("mean( log(counts + 0.5 ) )")) +
ylab(paste0("beta: conditionKD"))
p + geom_text(data=(res %>% filter(str_detect(target_id,"(smt3|^R1_DM|^R2_DM)"))), aes(mean_obs, b, label=sub(".*gene=(.*):CDS.*", "\\1",sub("_DM.*","",target_id))), position = position_nudge(x=0.5))

Fig. MA plot from sleuth analysis, Wald test. (warning: beta != log2FC). Only genes with qval<0.05 & log2FC (from kallisto numbers) are highlighted.
This is an alternative visualization of the data, where we do a scatter plot in control vs knockdown. We can choose to select the dots based on qvalues either from the Wald test, or the LTR test. The test choice does not change the interpretation of the data in the light of the R1, R2 up-regulation compared to genes.
### values tables
table.tpm<-sleuth_to_matrix(so, which_df = "obs_norm", which_units = "tpm") %>% as.data.frame() %>% rename(shW.1=YL25, shW.2=YL27, shSmt3.1=YL26, shSmt3.2=YL28) %>% tibble::rownames_to_column("target_id") %>% mutate(x=(shW.1+shW.2)/2, y=(shSmt3.1+shSmt3.2)/2) %>% mutate(log2FC=log2(y/x))
tab.raw<-left_join(table.srbp, table.tpm, by="target_id",suffix=c(".srpb",".tpm"))
### stats tables
tab.wald<-sleuth_results(so, 'conditionKD', show_all = TRUE) %>% arrange(target_id) # wald test
tab.lrt<-sleuth_results(so, 'reduced:full', test_type = 'lrt', show_all = TRUE) %>% arrange(target_id) #lrt test
tab.stat<-left_join(tab.wald, tab.lrt, by="target_id", suffix=c(".wald",".lrt"))
### all data together fold change is taken from the est_counts (or scaled reads per base for gene analysis)
tab.plot<-left_join(tab.stat, tab.raw, by="target_id") %>% mutate(signWald=(abs(log2FC.srpb)>1 & qval.wald<0.05), signLRT=(abs(log2FC.srpb)>1 & qval.lrt<0.05))
### plot
p.sub<-geom_text(data=(tab.plot %>% filter(str_detect(target_id,"(smt3|^R1_DM|^R2_DM)"))), aes(log2(x.tpm+1), log2(y.tpm+1), label=sub(".*gene=(.*):CDS.*", "\\1",sub("_DM.*","",target_id))), position = position_nudge(x=1))
top20.x<-tail(sort(tab.plot$x.tpm),20)[1]
top20.y<-tail(sort(tab.plot$y.tpm),20)[1]
p<-ggplot(tab.plot, aes(log2(x.tpm+1), log2(y.tpm+1))) + theme_classic() + theme(legend.position="left") + p.sub
p<-p + scale_color_manual(values=c("black", "beige", "green4"))
p<-p + xlab(paste("shW, log2 (av.tpm + 1) ")) + ylab(paste("shSmt3, log2 (av.tpm + 1) "))
p<-p + geom_hline(yintercept = log2(top20.y+1), linetype="dotted") + geom_vline(xintercept = log2(top20.x+1), linetype="dotted")
p1<-p + geom_point(aes(colour = paste(signWald)), alpha = 0.4) + geom_abline(intercept = 0, slope=1, col="blue")
p2<-p + geom_point(aes(colour = paste(signLRT)), alpha = 0.4) + geom_abline(intercept = 0, slope=1, col="blue")
p3<-ggExtra::ggMarginal(p1, type="histogram", size=3, fill = "grey70")
p4<-ggExtra::ggMarginal(p2, type="histogram", size=3, fill = "grey70")
gridExtra::grid.arrange(p3, p4, nrow=1)

Figure. Scatter plots of gene expression in control vs KD. The dotted line denotes area that contains the top 20 most highly expressed genes in the corresponding condition. Values log2FC>2 and significant: qvalue (Wald or LRT) < 0.05 are highlighted. The histograms above the y-axis and x-axis show the distribution of values. Blue line shows 1:1 ratio.
Note: Here, pale dots show values (“NA”) which did not pass various sleuth filters and should be ignored. Can be removed by “show_all=FALSE”.
First we prepare the data in a convenient table of all.
tab.all<-tab.plot %>% mutate("filt_out"=is.na(b), "type"=as.factor(ifelse(str_detect(target_id, "\\|"),"TE","RefSeq")))
Number of RepBase (TE) and other (protein-coding genes,ncRNA, etc) entries from RefSeq (all genes) analyzed; some were filtered out because they didn’t pass sleuth criteria (usually very low read counts or large discrepancy between reps).
tab.all %>% group_by(type, filt_out) %>% tally() %>% arrange(filt_out)
## # A tibble: 4 x 3
## # Groups: type [2]
## type filt_out n
## <fct> <lgl> <int>
## 1 RefSeq FALSE 17975
## 2 TE FALSE 166
## 3 RefSeq TRUE 15330
## 4 TE TRUE 72
Number of significantly changed genes (qval 0.05, LRT or Walt Test; logFC (onscaled_reads_per_base)):
#Wald test
tab.all %>% filter(abs(log2FC.srpb)>1, qval.wald<0.05) %>% mutate("change"=ifelse(log2FC.srpb>0,"up", "down")) %>% group_by(type,change) %>% tally() %>% arrange(desc(change))
## # A tibble: 4 x 3
## # Groups: type [2]
## type change n
## <fct> <chr> <int>
## 1 RefSeq up 187
## 2 TE up 36
## 3 RefSeq down 159
## 4 TE down 2
#LRT
tab.all %>% filter(abs(log2FC.srpb)>1, qval.lrt<0.05) %>% mutate("change"=ifelse(log2FC.srpb>0,"up", "down")) %>% group_by(type,change) %>% tally() %>% arrange(desc(change))
## # A tibble: 4 x 3
## # Groups: type [2]
## type change n
## <fct> <chr> <int>
## 1 RefSeq up 56
## 2 TE up 23
## 3 RefSeq down 64
## 4 TE down 2
From 17975 RefSeq transcripts and 166 RebBase entries which passed the initial filters, 187(56) and 36(23) are significantly up-regulated more than 2-fold in smt3 KD, and 159(64) or 2 are down-regulated more than 2-fold (qvalue 0.05, Wald test or LTR test).
Therefore, smt3 KD led to >2 fold up-regulation for ~30% of the TE-s and ~4% of the annotated genes (protein-coding and ncRNAs), and down-regulation of ~1.5% of the annotated genes.
R1 and R2 - from relatively lowly expressed genes in the control - become among the top 20 most abundant products in SUMO KD. : see below ***
Although several hundred genes change, it is evident from the plots mostly modestly expressed genes that show the large FC, except for R1 and R2. If we filter genes with over 100 times change in expression log2FC>=~6.64, significant in at least one test:
tab.all %>% filter(log2FC.srpb>log2(100)) %>% filter(qval.wald<0.05 | qval.lrt<0.05) %>%
select(target_id, shW.1.srpb, shW.2.srpb, shSmt3.1.srpb, shSmt3.2.srpb, log2FC.srpb, type, b, qval.wald, qval.lrt) %>% mutate_at(2:5, funs(round(.,3))) %>% arrange(-( shSmt3.1.srpb*0.5+shSmt3.2.srpb*0.5)) %>% datatable(options=list(pageLength=5))
Results show that all other genes, apart from R1 and R2 in this set, are ones that practically have no reads in the control, and just few reads in the KD.
Numbers below show the xpression of R1 and R2 elements; including rank of the average values. “X” and “Y” are average per control and KD, respectively:
Note: Values can be seen as transcripts per million (TPM), see https://haroldpimentel.wordpress.com/2014/05/08/what-the-fpkm-a-review-rna-seq-expression-units/ , or scaled_reads_per_base (gene mode only): The average number of reads mapping to each base across the whole gene
tab.all.ranks1<-tab.all %>% filter(!filt_out) %>% select(target_id, matches("sh.*srpb"),matches("(x|y).srpb")) %>% mutate(ctrl.rank.srpb=rank(desc(x.srpb)), kd.rank.srpb=rank(desc(y.srpb)))
tab.all.ranks2<-tab.all %>% filter(!filt_out) %>% select(target_id, matches("sh.*tpm"),matches("(x|y).tpm")) %>% mutate(ctrl.rank.tpm=rank(desc(x.tpm)), kd.rank.tpm=rank(desc(y.tpm)))
tab.all.ranks1 %>% filter(str_detect(target_id, "^R[12]_"))
## target_id shW.1.srpb shSmt3.1.srpb shW.2.srpb shSmt3.2.srpb x.srpb y.srpb ctrl.rank.srpb kd.rank.srpb
## 1 R1_DM|Drosophila_fruit_fly_genus 239.2480 348313.7 243.4254 201125.2 241.3367 274719.5 9076 1
## 2 R2_DM|Drosophila_fruit_fly_genus 629.5107 194856.4 621.1112 197785.8 625.3109 196321.1 5702 4
tab.all.ranks2 %>% filter(str_detect(target_id, "^R[12]_"))
## target_id shW.1.tpm shSmt3.1.tpm shW.2.tpm shSmt3.2.tpm x.tpm y.tpm ctrl.rank.tpm kd.rank.tpm
## 1 R1_DM|Drosophila_fruit_fly_genus 2.740584 4013.985 2.790111 2315.011 2.765347 3164.498 11553 18
## 2 R2_DM|Drosophila_fruit_fly_genus 10.911782 3397.955 10.772657 3444.922 10.842219 3421.439 7863 16
Notably, R1 and R2 change from relatively lowly expressed elements, to the top 20 most abundant transcripts (in TPM). No other elements shows such a dramatic increase.
I’d like to thank Lynn Yi for the greatly helpful tips on using kallisto and sleuth.